Convert to dc.js Friendly CSV


In [1]:
import sys

In [2]:
sys.path.append('/mnt/home/ubuntu/projects/tools/')

In [3]:
import sys,re,json,os,csv,glob
import numpy as np
import matplotlib.pyplot as plt
from dateutil.parser import parse
import datetime,time,random,traceback
from geopy import distance
import geolocator
geo=geolocator.Geolocator()
geo.init()


WARNING:geopy:BeautifulSoup was not found. The SemanticMediaWiki geocoder will not work.
Loading the world...
Oh, the world is already out there...

In [4]:
files=glob.glob('../data/2014-*/*json')
files.sort()
print len(files)
print files[0]


6220
../data/2014-06/DataSift-b44470c030b631466ab6261723158109-1413565092.json

In [5]:
tweets=[]
for file in files:
    # Cycle through files
    fileString=open(file,'r').read().decode('utf-8')
    # Read file as one long string and convert to unicode
    fileDocs=[json.loads(line) for line in fileString.split('\n')]
    # Split into lines and load each line as JSON
    fileDocs=[d for d in fileDocs if d['interaction']['tag_tree']['topic'].keys()[0] in ['Discrimination', 'Prevention']]
    # Keep only the topics of interest
    fileTweets=[t for t in fileDocs if t['interaction']['type'] in ['twitter']]
    # Keep only tweets (the interaction type is lowercase 'twitter')
    tweets.extend(fileTweets)
    # Add list of tweets from file to global list
print len(tweets)


428706

In [4]:
nTime=0
nId=0
nCity=0
# For counting errors
cities=['Belo Horizonte', u'Brasília, Brasilia', u'Cuiabá', 'Curitiba', 'Fortaleza', 'Manaus', 'Natal, Rio Grande do Norte', 
        'Porto Alegre', 'Recife', 'Rio de Janeiro', 'Salvador, Bahia', u'São Paulo', 'Rio Branco, Acre', u'Maceió', u'Macapá',
        u'Vitória, Espírito Santo', u'Goiânia', u'São Luís, Maranhão', 'Campo Grande, Mato Grosso do Sul', u'Belém, Pará',
        u'João Pessoa, Paraíba', u'Teresina, Piauí', u'Porto Velho, Rondônia', 'Boa Vista, Roraima', u'Florianópolis',
        'Aracaju, Sergipe', 'Palmas, Tocantins']
# Define cities to 'snap' coords to
coords=[geo.geoLocate(c)[0][1:3] for c in cities]
# Get coords from geolocator
tolerance=120
# Set tolerance to snap locations to nearest cities, in KM
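
A minimal sketch of the snapping check this tolerance feeds into, using geopy's distance API (the Santos point is illustrative, not taken from the data):

from geopy import distance
santos=(-23.96083, -46.33361)   # illustrative point on the coast
sao_paulo=(-22.0, -49.0)        # São Paulo's coords as resolved by geolocator above
d=distance.distance(santos, sao_paulo)
print d.kilometers              # well over tolerance=120, so a tweet here would be discarded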

In [6]:
outFile=csv.writer(open('cities.csv','w'),delimiter='\t')
for i,j in zip(cities,coords):
    outFile.writerow([i.encode('utf-8'),j[0],j[1]])
    print i,j


Belo Horizonte (-19.92623, -43.93982)
Brasília, Brasilia ('-15.79159', '-47.89558')
Cuiabá (-15.41924, -55.89023)
Curitiba (-25.50395, -49.29082)
Fortaleza (-3.72271, -38.52465)
Manaus (-3.04361, -60.01282)
Natal, Rio Grande do Norte ('-5.795', '-35.20944')
Porto Alegre (-30.11462, -51.16393)
Recife (-8.01175, -34.95291)
Rio de Janeiro (-22.0, -42.5)
Salvador, Bahia (-12.97177, -38.50811)
São Paulo (-22.0, -49.0)
Rio Branco, Acre ('-9.97472', '-67.81')
Maceió (-9.66583, -35.73528)
Macapá (0.59873, -50.76849)
Vitória, Espírito Santo ('-20.29048', '-40.28808')
Goiânia (-16.64019, -49.25993)
São Luís, Maranhão ('-2.64949', '-44.30441')
Campo Grande, Mato Grosso do Sul ('-20.44278', '-54.64639')
Belém, Pará ('-1.34341', '-48.41816')
João Pessoa, Paraíba ('-7.17088', '-34.86536')
Teresina, Piauí ('-5.08917', '-42.80194')
Porto Velho, Rondônia ('-8.76194', '-63.90389')
Boa Vista, Roraima ('2.81972', '-60.67333')
Florianópolis (-27.61455, -48.50116)
Aracaju, Sergipe ('-10.91111', '-37.07167')
Palmas, Tocantins ('-10.21278', '-48.36028')

In [7]:
print tweets[11]


{u'twitter': {u'lang': u'pt', u'source': u'<a href="https://twitter.com/download/android" rel="nofollow">Twitter for  Android</a>', u'text': u'Nao curto ropa apertada isso e coisa de boiola,.so mais minha peita.gigante, meu bone e meu board debaixo do pe', u'created_at': u'Thu, 19 Jun 2014 06:06:06 +0000', u'filter_level': u'medium', u'user': {u'lang': u'pt', u'created_at': u'Thu, 07 Jun 2012 05:29:41 +0000', u'utc_offset': -10800, u'id_str': u'601583939', u'statuses_count': 39938, u'name': u'Fabio Andrey  \u270c', u'friends_count': 511, u'profile_image_url_https': u'https://pbs.twimg.com/profile_images/468547366929567744/GmukbbI6_normal.jpeg', u'time_zone': u'Brasilia', u'profile_image_url': u'http://pbs.twimg.com/profile_images/468547366929567744/GmukbbI6_normal.jpeg', u'followers_count': 788, u'screen_name': u'Sou_Nigga', u'location': u'sao leo', u'favourites_count': 5178, u'verified': False, u'geo_enabled': True, u'listed_count': 1, u'id': 601583939, u'description': u'Bibi'}, u'id': u'479505381132345344'}, u'interaction': {u'author': {u'username': u'Sou_Nigga', u'name': u'Fabio Andrey  \u270c', u'language': u'pt', u'link': u'http://twitter.com/Sou_Nigga', u'avatar': u'http://pbs.twimg.com/profile_images/468547366929567744/GmukbbI6_normal.jpeg', u'id': 601583939}, u'created_at': u'Thu, 19 Jun 2014 06:06:06 +0000', u'tag_tree': {u'topic': {u'Discrimination': [u'Negative']}}, u'content': u'Nao curto ropa apertada isso e coisa de boiola,.so mais minha peita.gigante, meu bone e meu board debaixo do pe', u'source': u'Twitter for  Android', u'link': u'http://twitter.com/Sou_Nigga/status/479505381132345344', u'received_at': 1403157966.5953, u'type': u'twitter', u'id': u'1e3f777ccc6dab00e074c042563137a0', u'schema': {u'version': 3}}, u'demographic': {u'gender': u'male'}, u'language': {u'confidence': 99, u'tag': u'pt', u'tag_extended': u'pt'}, u'klout': {u'score': 44}}
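A quick sketch of how the fields used later map onto this structure, read straight off the tweet just printed (not an executed cell):

t=tweets[11]
print t['interaction']['tag_tree']['topic'].keys()[0]       # Discrimination (the category)
print t['interaction']['tag_tree']['topic'].values()[0][0]  # Negative (the sub-category)
print 'geo' in t['twitter']                                 # False: no coordinates, so this tweet would be counted in nCity below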

In [8]:
def getClosestCity(tCoords):
  '''Takes tuple of coordinates, cycles through cities 
     in global variable <cities>, reads their coords from
     global variable <coords> and returns closest
     ------
     returns tuple of (coords of closest city, city name)
     OR None, if no city within tolerance'''
  dist=999999
  closest='ZZZZ'
  cCoords=[]
  for c,cc in enumerate(cities):
    cDist=distance.distance(tCoords,coords[c])
    if cDist<dist:
      dist=cDist
      closest=cc
      cCoords=coords[c]
  if dist<tolerance:
    return cCoords,closest
  else:
    return None

In [9]:
import gender
g=gender.Gender()
g.gender(tweets[1]['interaction']['author']['name'])


Out[9]:
{u'VALENTINE': {'gender': 'mm',
  'probability': 0.7840717162530856,
  'volume_female': 1662.0,
  'volume_male': 6035.0}}

In [10]:
def mungeDate(dummyTime):
  '''Takes Twitter timestamp
     ------
     returns datetime object; str() gives YYYY-MM-DD hh:mm:ss
  '''
  # Parse from this format: Thu, 02 Jan 2014 16:26:15 +0000...
  timeStruct=datetime.datetime.strptime(dummyTime,'%a, %d %b %Y %H:%M:%S +0000')
  # ...into a datetime; the csv writer below serialises it as YYYY-MM-DD hh:mm:ss
  return timeStruct
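
A quick sanity check of the parse, using the created_at from the tweet printed above (a sketch, not an executed cell):

d=mungeDate('Thu, 19 Jun 2014 06:06:06 +0000')
print d  # 2014-06-19 06:06:06 -- str() of the datetime is exactly what ends up in the CSV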

In [11]:
print coords
print coords[cities.index(u'São Paulo')]
getClosestCity(coords[cities.index(u'São Paulo')])


[(-19.92623, -43.93982), ('-15.79159', '-47.89558'), (-15.41924, -55.89023), (-25.50395, -49.29082), (-3.72271, -38.52465), (-3.04361, -60.01282), ('-5.795', '-35.20944'), (-30.11462, -51.16393), (-8.01175, -34.95291), (-22.0, -42.5), (-12.97177, -38.50811), (-22.0, -49.0), ('-9.97472', '-67.81'), (-9.66583, -35.73528), (0.59873, -50.76849), ('-20.29048', '-40.28808'), (-16.64019, -49.25993), ('-2.64949', '-44.30441'), ('-20.44278', '-54.64639'), ('-1.34341', '-48.41816'), ('-7.17088', '-34.86536'), ('-5.08917', '-42.80194'), ('-8.76194', '-63.90389'), ('2.81972', '-60.67333'), (-27.61455, -48.50116), ('-10.91111', '-37.07167'), ('-10.21278', '-48.36028')]
(-22.0, -49.0)
Out[11]:
((-22.0, -49.0), u'S\xe3o Paulo')

In [12]:
outFile=csv.writer(open('../data/all.csv','w'))
# Open output file
nTime=nId=nCity=nRange=nCategory=nSubCategory=nTopic=0
# Reset error counters

outFile.writerow(['city','lat','lon','origdate','topic']) 

for t,tweet in enumerate(tweets):
  cityCoords=None
  try:
    tTime=tweet['interaction']['created_at']
  except:
    nTime+=1
  try:
    id=tweet['interaction']['id']
  except:
    nId+=1
  try:
    category=tweet['interaction']['tag_tree']['topic'].keys()[0]
  except:
    nCategory+=1
  try:
    subCategory=tweet['interaction']['tag_tree']['topic'].values()[0][0]
  except:
    nSubCategory+=1
  try:
    topic = category + "_" + subCategory
  except:
    nTopic+=1
  if 'geo' in tweet['twitter'].keys():
    res=getClosestCity([tweet['twitter']['geo']['latitude'],tweet['twitter']['geo']['longitude']])
    if res:
      (cityCoords,city)=res
      outFile.writerow([city.partition(',')[0].encode("utf-8"),cityCoords[0],cityCoords[1],mungeDate(tTime),topic])
    else:
      # If the location doesn't snap to one of the chosen cities within tolerance, throw it away
      nRange+=1
  else:
    # Most tweets carry no geo payload; count them and move on
    nCity+=1
    # (debug prints below left from investigating the missing-geo tweets)
    # print tweet
    # print 'FAILING...'
    # print tweet.keys()
    # sys.exit(1)
print nTime,nId,nCity,nRange,nCategory,nSubCategory,nTopic


0 0 405176 14140 0 0 0

In [13]:
!head ../data/all.csv
!wc ../data/all.csv


city,lat,lon,origdate,topic
Porto Alegre,-30.11462,-51.16393,2014-06-19 06:01:11,Prevention_Positive
Fortaleza,-3.72271,-38.52465,2014-06-19 09:06:28,Discrimination_Negative
Recife,-8.01175,-34.95291,2014-06-19 00:22:09,Discrimination_Negative
Brasília,-15.79159,-47.89558,2014-06-19 02:07:21,Discrimination_Negative
Fortaleza,-3.72271,-38.52465,2014-06-19 23:55:34,Discrimination_Negative
Salvador,-12.97177,-38.50811,2014-06-19 00:02:22,Prevention_Positive
Rio de Janeiro,-22.0,-42.5,2014-06-19 00:27:53,Discrimination_Negative
Porto Alegre,-30.11462,-51.16393,2014-06-19 02:33:42,Discrimination_Negative
Belo Horizonte,-19.92623,-43.93982,2014-06-19 03:19:31,Discrimination_Negative
  9371  24351 700416 ../data/all.csv
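
Since dc.js/crossfilter wants exactly this flat, one-row-per-event shape, a quick round-trip check confirms the file parses cleanly (a sketch, not an executed cell; the tally is illustrative):

import csv, collections
counts=collections.Counter()
for row in csv.DictReader(open('../data/all.csv')):
    counts[(row['city'],row['topic'])]+=1
print counts.most_common(5)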

In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[1]:
